E-Commerce Retail Store Data Analysis¶

Data Pre processing¶

In [267]:
data <- read.csv("data.csv", encoding="ISO-8859-1")
In [269]:
head(data)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850 United Kingdom
536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850 United Kingdom
536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850 United Kingdom
536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE6 12/1/2010 8:26 3.39 17850 United Kingdom
536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850 United Kingdom
536365 22752 SET 7 BABUSHKA NESTING BOXES 2 12/1/2010 8:26 7.65 17850 United Kingdom
In [270]:
dim(data)
  1. 541909
  2. 8
In [271]:
length(unique(data$CustomerID))
4373
In [272]:
head(data[data$Quantity < 0, ])
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
142C536379 D Discount -1 12/1/2010 9:41 27.50 14527 United Kingdom
155C536383 35004C SET OF 3 COLOURED FLYING DUCKS -1 12/1/2010 9:49 4.65 15311 United Kingdom
236C536391 22556 PLASTERS IN TIN CIRCUS PARADE -12 12/1/2010 10:24 1.65 17548 United Kingdom
237C536391 21984 PACK OF 12 PINK PAISLEY TISSUES -24 12/1/2010 10:24 0.29 17548 United Kingdom
238C536391 21983 PACK OF 12 BLUE PAISLEY TISSUES -24 12/1/2010 10:24 0.29 17548 United Kingdom
239C536391 21980 PACK OF 12 RED RETROSPOT TISSUES -24 12/1/2010 10:24 0.29 17548 United Kingdom
In [273]:
str(data)
'data.frame':	541909 obs. of  8 variables:
 $ InvoiceNo  : Factor w/ 25900 levels "536365","536366",..: 1 1 1 1 1 1 1 2 2 3 ...
 $ StockCode  : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ...
 $ Description: Factor w/ 4224 levels ""," 4 PURPLE FLOCK DINNER CANDLES",..: 4027 4035 932 1959 2980 3235 1573 1698 1695 259 ...
 $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
 $ InvoiceDate: Factor w/ 23260 levels "1/10/2011 10:04",..: 6839 6839 6839 6839 6839 6839 6839 6840 6840 6841 ...
 $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
 $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
 $ Country    : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...

Missing values calculation¶

In [274]:
null_percentage <- colMeans(is.na(data)) * 100
null_percentage
InvoiceNo
0
StockCode
0
Description
0
Quantity
0
InvoiceDate
0
UnitPrice
0
CustomerID
24.9266943342886
Country
0
In [275]:
colSums(is.na(data[, sapply(data, is.numeric)]))
Quantity
0
UnitPrice
0
CustomerID
135080
In [276]:
colSums(is.na(data[, sapply(data, is.factor)]))
InvoiceNo
0
StockCode
0
Description
0
InvoiceDate
0
Country
0

Summary statistics¶

In [277]:
numeric_data <- data[, sapply(data, is.numeric)]

# Generate summary statistics for integer and numeric variables
t(summary(numeric_data))
                                                                        
   Quantity  Min.   :-80995.00   1st Qu.:     1.00   Median :     3.00  
  UnitPrice  Min.   :-11062.06   1st Qu.:     1.25   Median :     2.08  
  CustomerID Min.   :12346       1st Qu.:13953       Median :15152      
                                                                        
   Quantity  Mean   :     9.55   3rd Qu.:    10.00   Max.   : 80995.00  
  UnitPrice  Mean   :     4.61   3rd Qu.:     4.13   Max.   : 38970.00  
  CustomerID Mean   :15288       3rd Qu.:16791       Max.   :18287      
                             
   Quantity                  
  UnitPrice                  
  CustomerID NA's   :135080  
In [278]:
categorical_data <- data[, sapply(data, is.factor)]

# Generate summary statistics for categorical variables
t(summary(categorical_data))
                                                                                   
  InvoiceNo                             573585 :  1114                             
  StockCode                             85123A :  2313                             
                            Description WHITE HANGING HEART T-LIGHT HOLDER:  2369  
          InvoiceDate                   10/31/2011 14:41:  1114                    
          Country                       United Kingdom:495478                      
                                                                                   
  InvoiceNo                             581219 :   749                             
  StockCode                             22423  :  2203                             
                            Description REGENCY CAKESTAND 3 TIER          :  2200  
          InvoiceDate                   12/8/2011 9:28  :   749                    
          Country                       Germany       :  9495                      
                                                                                   
  InvoiceNo                             581492 :   731                             
  StockCode                             85099B :  2159                             
                            Description JUMBO BAG RED RETROSPOT           :  2159  
          InvoiceDate                   12/9/2011 10:03 :   731                    
          Country                       France        :  8557                      
                                                                                   
  InvoiceNo                             580729 :   721                             
  StockCode                             47566  :  1727                             
                            Description PARTY BUNTING                     :  1727  
          InvoiceDate                   12/5/2011 17:24 :   721                    
          Country                       EIRE          :  8196                      
                                                                                   
  InvoiceNo                             558475 :   705                             
  StockCode                             20725  :  1639                             
                            Description LUNCH BAG RED RETROSPOT           :  1638  
          InvoiceDate                   6/29/2011 15:58 :   705                    
          Country                       Spain         :  2533                      
                                                                                   
  InvoiceNo                             579777 :   687                             
  StockCode                             84879  :  1502                             
                            Description ASSORTED COLOUR BIRD ORNAMENT     :  1501  
          InvoiceDate                   11/30/2011 15:13:   687                    
          Country                       Netherlands   :  2371                      
                                                                                   
  InvoiceNo                             (Other):537202                             
  StockCode                             (Other):530366                             
                            Description (Other)                           :530315  
          InvoiceDate                   (Other)         :537202                    
          Country                       (Other)       : 15279                      

The proportions of transactions originating from each country in the dataset, providing insights into the geographical distribution of sales.¶

In [279]:
country_proportions <- prop.table(table(data$Country))

# Format the proportions as decimals
formatted_proportions <- format(country_proportions, scientific = FALSE)
In [280]:
formatted_proportions
Australia
'0.00232326830'
Austria
'0.00073997664'
Bahrain
'0.00003506124'
Belgium
'0.00381798420'
Brazil
'0.00005905050'
Canada
'0.00027864457'
Channel Islands
'0.00139875883'
Cyprus
'0.00114779419'
Czech Republic
'0.00005535985'
Denmark
'0.00071783270'
EIRE
'0.01512431054'
European Community
'0.00011256502'
Finland
'0.00128250315'
France
'0.01579047405'
Germany
'0.01752139197'
Greece
'0.00026941793'
Hong Kong
'0.00053145454'
Iceland
'0.00033584975'
Israel
'0.00054806250'
Italy
'0.00148179860'
Japan
'0.00066062752'
Lebanon
'0.00008303977'
Lithuania
'0.00006458649'
Malta
'0.00023435669'
Netherlands
'0.00437527334'
Norway
'0.00200402651'
Poland
'0.00062925694'
Portugal
'0.00280305365'
RSA
'0.00010702904'
Saudi Arabia
'0.00001845328'
Singapore
'0.00042258017'
Spain
'0.00467421652'
Sweden
'0.00085254166'
Switzerland
'0.00369434721'
United Arab Emirates
'0.00012548232'
United Kingdom
'0.91431956288'
Unspecified
'0.00082301641'
USA
'0.00053699053'

Convert "Description" to character type and remove extra whitespaces¶

In [282]:
data$InvoiceNo <- as.character(data$InvoiceNo)
data$Description <- as.character(data$Description)
data$Description <- trimws(data$Description)

head(data)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850 United Kingdom
536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850 United Kingdom
536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850 United Kingdom
536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE6 12/1/2010 8:26 3.39 17850 United Kingdom
536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850 United Kingdom
536365 22752 SET 7 BABUSHKA NESTING BOXES 2 12/1/2010 8:26 7.65 17850 United Kingdom
In [283]:
str(data)
'data.frame':	541909 obs. of  8 variables:
 $ InvoiceNo  : chr  "536365" "536365" "536365" "536365" ...
 $ StockCode  : Factor w/ 4070 levels "10002","10080",..: 3538 2795 3045 2986 2985 1663 801 1548 1547 3306 ...
 $ Description: chr  "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
 $ Quantity   : int  6 6 8 6 6 2 6 6 6 32 ...
 $ InvoiceDate: Factor w/ 23260 levels "1/10/2011 10:04",..: 6839 6839 6839 6839 6839 6839 6839 6840 6840 6841 ...
 $ UnitPrice  : num  2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
 $ CustomerID : int  17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
 $ Country    : Factor w/ 38 levels "Australia","Austria",..: 36 36 36 36 36 36 36 36 36 36 ...

Remove missing values¶

In [284]:
data <- data[!is.na(data$CustomerID), ]
In [285]:
colMeans(is.na(data)) * 100
InvoiceNo
0
StockCode
0
Description
0
Quantity
0
InvoiceDate
0
UnitPrice
0
CustomerID
0
Country
0

To accurately understand customer behavior, we need complete data on who the customers are. So, we're removing rows where we don't have customer IDs. This helps keep our analysis accurate and ensures we're not missing any information about the products they purchased.

Identifying and removing duplicates¶

In [286]:
sum(duplicated(data))
5225
In [287]:
duplicates <- duplicated(data)
In [288]:
# Extract duplicate rows
duplicate_rows <- data[duplicates, ]
duplicate_rows
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
518536409 21866 UNION JACK FLAG LUGGAGE TAG 1 12/1/2010 11:45 1.25 17908 United Kingdom
528536409 22866 HAND WARMER SCOTTY DOG DESIGN 1 12/1/2010 11:45 2.10 17908 United Kingdom
538536409 22900 SET 2 TEA TOWELS I LOVE LONDON 1 12/1/2010 11:45 2.95 17908 United Kingdom
540536409 22111 SCOTTIE DOG HOT WATER BOTTLE 1 12/1/2010 11:45 4.95 17908 United Kingdom
556536412 22327 ROUND SNACK BOXES SET OF 4 SKULLS 1 12/1/2010 11:49 2.95 17920 United Kingdom
588536412 22273 FELTCRAFT DOLL MOLLY 1 12/1/2010 11:49 2.95 17920 United Kingdom
590536412 22749 FELTCRAFT PRINCESS CHARLOTTE DOLL 1 12/1/2010 11:49 3.75 17920 United Kingdom
595536412 22141 CHRISTMAS CRAFT TREE TOP ANGEL 1 12/1/2010 11:49 2.10 17920 United Kingdom
599536412 21448 12 DAISY PEGS IN WOOD BOX 1 12/1/2010 11:49 1.65 17920 United Kingdom
601536412 22569 FELTCRAFT CUSHION BUTTERFLY 2 12/1/2010 11:49 3.75 17920 United Kingdom
602536412 21448 12 DAISY PEGS IN WOOD BOX 2 12/1/2010 11:49 1.65 17920 United Kingdom
605536412 21448 12 DAISY PEGS IN WOOD BOX 2 12/1/2010 11:49 1.65 17920 United Kingdom
606536412 22902 TOTE BAG I LOVE LONDON 7 12/1/2010 11:49 2.10 17920 United Kingdom
617536412 21708 FOLDING UMBRELLA CREAM POLKADOT 1 12/1/2010 11:49 4.95 17920 United Kingdom
618536412 22900 SET 2 TEA TOWELS I LOVE LONDON 2 12/1/2010 11:49 2.95 17920 United Kingdom
619536412 21706 FOLDING UMBRELLA RED/WHITE POLKADOT1 12/1/2010 11:49 4.95 17920 United Kingdom
621536412 85184C S/4 VALENTINE DECOUPAGE HEART BOX 1 12/1/2010 11:49 2.95 17920 United Kingdom
759536446 21651 HANGING GLASS ETCHED TEALIGHT 6 12/1/2010 12:15 1.65 15983 United Kingdom
832536464 22866 HAND WARMER SCOTTY DOG DESIGN 1 12/1/2010 12:23 2.10 17968 United Kingdom
836536464 22945 CHRISTMAS METAL TAGS ASSORTED 6 12/1/2010 12:23 0.85 17968 United Kingdom
853536464 21992 VINTAGE PAISLEY STATIONERY SET 1 12/1/2010 12:23 2.95 17968 United Kingdom
860536464 22866 HAND WARMER SCOTTY DOG DESIGN 1 12/1/2010 12:23 2.10 17968 United Kingdom
892536488 84347 ROTATING SILVER ANGELS T-LIGHT HLDR1 12/1/2010 12:31 2.55 17897 United Kingdom
1062536522 21121 SET/10 RED POLKADOT PARTY CANDLES 1 12/1/2010 12:49 1.25 15012 United Kingdom
1068536522 21122 SET/10 PINK POLKADOT PARTY CANDLES 1 12/1/2010 12:49 1.25 15012 United Kingdom
1141536528 22865 HAND WARMER OWL DESIGN 1 12/1/2010 13:17 2.10 15525 United Kingdom
1154536528 85114B IVORY ENCHANTED FOREST PLACEMAT 1 12/1/2010 13:17 1.65 15525 United Kingdom
1160536528 22584 PACK OF 6 PANNETONE GIFT BOXES 2 12/1/2010 13:17 2.55 15525 United Kingdom
1166536528 22911 PAPER CHAIN KIT LONDON 1 12/1/2010 13:17 2.95 15525 United Kingdom
1167536528 22743 MAKE YOUR OWN FLOWERPOWER CARD KIT 1 12/1/2010 13:17 2.95 15525 United Kingdom
...........................
537818581352 22481 BLACK TEA TOWEL CLASSIC DESIGN 2 12/8/2011 12:26 0.39 14698 United Kingdom
537820581352 82494L WOODEN FRAME ANTIQUE WHITE 1 12/8/2011 12:26 2.95 14698 United Kingdom
537825581352 22564 ALPHABET STENCIL CRAFT 1 12/8/2011 12:26 1.25 14698 United Kingdom
538295581404 22469 HEART OF WICKER SMALL 2 12/8/2011 13:47 1.65 13680 United Kingdom
538377581405 22500 SET OF 2 TINS JARDIN DE PROVENCE 1 12/8/2011 13:50 1.25 13521 United Kingdom
538467581405 20975 12 PENCILS SMALL TUBE RED RETROSPOT 1 12/8/2011 13:50 0.65 13521 United Kingdom
538495581405 23212 HEART WREATH DECORATION WITH BELL 1 12/8/2011 13:50 1.25 13521 United Kingdom
538627581412 21481 FAWN BLUE HOT WATER BOTTLE 1 12/8/2011 14:38 3.75 14415 United Kingdom
538678581412 22199 FRYING PAN RED RETROSPOT 1 12/8/2011 14:38 1.25 14415 United Kingdom
538752581414 22326 ROUND SNACK BOXES SET OF4 WOODLAND 1 12/8/2011 14:39 2.95 14730 United Kingdom
538757581414 22094 RED RETROSPOT TISSUE BOX 1 12/8/2011 14:39 0.39 14730 United Kingdom
538763581414 22379 RECYCLING BAG RETROSPOT 1 12/8/2011 14:39 2.10 14730 United Kingdom
538772581414 23291 DOLLY GIRL CHILDRENS CUP 1 12/8/2011 14:39 1.25 14730 United Kingdom
538777581414 22327 ROUND SNACK BOXES SET OF 4 SKULLS 1 12/8/2011 14:39 2.95 14730 United Kingdom
538782581414 22327 ROUND SNACK BOXES SET OF 4 SKULLS 1 12/8/2011 14:39 2.95 14730 United Kingdom
538786581414 23454 THREE MINI HANGING FRAMES 1 12/8/2011 14:39 4.15 14730 United Kingdom
538943581425 85152 HAND OVER THE CHOCOLATE SIGN 12 12/8/2011 15:31 2.10 14796 United Kingdom
538956581425 21231 SWEETHEART CERAMIC TRINKET BOX 4 12/8/2011 15:31 1.25 14796 United Kingdom
538971581425 23178 JAM CLOCK MAGNET 1 12/8/2011 15:31 1.25 14796 United Kingdom
539893581449 22423 REGENCY CAKESTAND 3 TIER 1 12/8/2011 17:37 12.75 12748 United Kingdom
539927581450 22118 JOY WOODEN BLOCK LETTERS 1 12/8/2011 17:54 1.25 16794 United Kingdom
540033581456 35964 FOLKART CLIP ON STARS 2 12/8/2011 18:42 0.39 17530 United Kingdom
540199581471 21411 GINGHAM HEART DOORSTOP RED 2 12/8/2011 19:29 1.95 14702 United Kingdom
541612581514 22075 6 RIBBONS ELEGANT CHRISTMAS 24 12/9/2011 11:20 0.39 17754 United Kingdom
541656581538 23275 SET OF 3 HANGING OWLS OLLIE BEAK 1 12/9/2011 11:34 1.25 14446 United Kingdom
541676581538 22068 BLACK PIRATE TREASURE CHEST 1 12/9/2011 11:34 0.39 14446 United Kingdom
541690581538 23318 BOX OF 6 MINI VINTAGE CRACKERS 1 12/9/2011 11:34 2.49 14446 United Kingdom
541693581538 22992 REVOLVER WOODEN RULER 1 12/9/2011 11:34 1.95 14446 United Kingdom
541700581538 22694 WICKER STAR 1 12/9/2011 11:34 2.10 14446 United Kingdom
541702581538 23343 JUMBO BAG VINTAGE CHRISTMAS 1 12/9/2011 11:34 2.08 14446 United Kingdom
In [289]:
data <- data[!duplicated(data), ]
# Reset row names
row.names(data) <- NULL

# Print the data frame after removing duplicates
head(data)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850 United Kingdom
536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850 United Kingdom
536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850 United Kingdom
536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE6 12/1/2010 8:26 3.39 17850 United Kingdom
536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850 United Kingdom
536365 22752 SET 7 BABUSHKA NESTING BOXES 2 12/1/2010 8:26 7.65 17850 United Kingdom
In [290]:
dim(data)
  1. 401604
  2. 8

The presence of entirely identical rows, including identical transaction times, implies potential data recording errors rather than genuine repeated transactions. Removing these duplicate rows will enhance dataset cleanliness, improving accuracy in customer clustering based on purchasing behaviors

In [291]:
length(unique(data$CustomerID))
4372

Initial observation of the transaction dataset revealed negative values in the "Quantity" column, suggesting returned items or cancelled orders. These entries seem to correlate with invoice numbers beginning with the letter 'C', possibly indicating cancellations

In [293]:
data$is_Cancelled <- startsWith(data$InvoiceNo, "C")
In [294]:
head(data)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryis_Cancelled
536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850 United Kingdom FALSE
536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850 United Kingdom FALSE
536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850 United Kingdom FALSE
536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE6 12/1/2010 8:26 3.39 17850 United Kingdom FALSE
536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850 United Kingdom FALSE
536365 22752 SET 7 BABUSHKA NESTING BOXES 2 12/1/2010 8:26 7.65 17850 United Kingdom FALSE
In [295]:
prop.table(table(data$is_Cancelled))
     FALSE       TRUE 
0.97790859 0.02209141 
In [296]:
format(prop.table(table(data$is_Cancelled)), scientific=FALSE)
FALSE
'0.97790859'
TRUE
'0.02209141'

The percentage of cancelled transactions in the dataset is: 2.20%

In [297]:
cancelled_data <- data[data$is_Cancelled, ]

# Remove the "CustomerID" column
cancelled_data <- cancelled_data[, !names(cancelled_data) %in% "CustomerID"]

# Generate summary statistics for the filtered data frame
summary_data <- summary(cancelled_data)

# Transpose the summary statistics
summary_data_transposed <- t(summary_data)

# Print the transposed summary statistics
print(summary_data_transposed)
                                                                     
 InvoiceNo            Length:8872             Class :character       
  StockCode           22423  : 180            M      : 175           
Description           Length:8872             Class :character       
   Quantity           Min.   :-80995.00       1st Qu.:    -6.00      
          InvoiceDate 10/12/2011 16:17: 101   7/19/2011 12:26 :  57  
  UnitPrice           Min.   :    0.01        1st Qu.:    1.45       
          Country     United Kingdom:7501     Germany       : 453    
is_Cancelled          Mode:logical            TRUE:8872              
                                                                     
 InvoiceNo            Mode  :character                               
  StockCode           POST   :  97            22960  :  86           
Description           Mode  :character                               
   Quantity           Median :    -2.00       Mean   :   -30.77      
          InvoiceDate 3/31/2011 11:58 :  45   7/21/2011 13:00 :  40  
  UnitPrice           Median :    2.95        Mean   :   18.90       
          Country     EIRE          : 247     France        : 148    
is_Cancelled                                                         
                                                                     
 InvoiceNo                                                           
  StockCode           D      :  77            22720  :  72           
Description                                                          
   Quantity           3rd Qu.:    -1.00       Max.   :    -1.00      
          InvoiceDate 10/12/2011 13:15:  39   10/6/2011 19:51 :  36  
  UnitPrice           3rd Qu.:    4.95        Max.   :38970.00       
          Country     USA           : 112     Australia     :  74    
is_Cancelled                                                         
                                             
 InvoiceNo                                   
  StockCode           (Other):8185           
Description                                  
   Quantity                                  
          InvoiceDate (Other)         :8554  
  UnitPrice                                  
          Country     (Other)       : 337    
is_Cancelled                                 

Insights from the cancelled transactions data reveal all quantities as negative, indicating cancellations. The UnitPrice column shows a varied range of products involved. Retaining cancelled transactions allows for deeper analysis, potentially enhancing clustering and improving recommendation systems by understanding cancellation patterns

Exploratory Data Analysis¶

In [69]:
library(ggplot2)

# Calculate the interquartile range (IQR) for the Quantity column
Q1 <- quantile(data$Quantity, 0.02)
Q3 <- quantile(data$Quantity, 0.98)
IQR <- Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Filter out rows with Quantity values outside the bounds
data_filtered <- data[data$Quantity >= lower_bound & data$Quantity <= upper_bound, ]

# Create a histogram of Quantity with the filtered data
ggplot(data_filtered, aes(x = Quantity)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Quantity (Filtered)", x = "Quantity", y = "Frequency")
No description has been provided for this image
In [36]:
library(ggplot2)

# Create a box plot of the "Quantity" column
ggplot(data, aes(y = Quantity)) +
  geom_boxplot(notch = TRUE) +
  labs(title = "Box Plot of Quantity")
No description has been provided for this image
In [298]:
# Filter rows where is_Cancelled is FALSE
filtered_data <- data[!data$is_Cancelled, ]

# Remove the "CustomerID" column
filtered_data <- subset(filtered_data, select = -CustomerID)

# Transpose the summary statistics
summary_data_transposed <- t(summary(filtered_data))

# Print the transposed summary statistics
print(summary_data_transposed)
                                                                         
 InvoiceNo            Length:392732             Class :character         
  StockCode           85123A :  2023            22423  :  1714           
Description           Length:392732             Class :character         
   Quantity           Min.   :    1.00          1st Qu.:    2.00         
          InvoiceDate 11/14/2011 15:27:   542   11/28/2011 15:54:   533  
  UnitPrice           Min.   :   0.000          1st Qu.:   1.250         
          Country     United Kingdom:349227     Germany       :  9027    
is_Cancelled          Mode :logical             FALSE:392732             
                                                                         
 InvoiceNo            Mode  :character                                   
  StockCode           85099B :  1615            84879  :  1395           
Description           Mode  :character                                   
   Quantity           Median :    6.00          Mean   :   13.15         
          InvoiceDate 12/5/2011 17:17 :   529   11/23/2011 13:39:   443  
  UnitPrice           Median :   1.950          Mean   :   3.126         
          Country     France        :  8327     EIRE          :  7228    
is_Cancelled                                                             
                                                                         
 InvoiceNo                                                               
  StockCode           47566  :  1390            20725  :  1304           
Description                                                              
   Quantity           3rd Qu.:   12.00          Max.   :80995.00         
          InvoiceDate 10/31/2011 14:09:   435   9/21/2011 14:40 :   421  
  UnitPrice           3rd Qu.:   3.750          Max.   :8142.750         
          Country     Spain         :  2480     Netherlands   :  2363    
is_Cancelled                                                             
                                               
 InvoiceNo                                     
  StockCode           (Other):383291           
Description                                    
   Quantity                                    
          InvoiceDate (Other)         :389829  
  UnitPrice                                    
          Country     (Other)       : 14080    
is_Cancelled                                   
In [38]:
non_cancelled_data <- subset(data, !is_Cancelled)

# Create a box plot of the "Quantity" column
ggplot(non_cancelled_data, aes(y = Quantity)) +
  geom_boxplot(notch = TRUE) +
  labs(title = "Box Plot of Quantity")
No description has been provided for this image

The box plot analysis of Quantity indicates a right-skewed distribution with a median suggesting most orders consist of small quantities. While the majority fall within a narrow interquartile range (IQR), outliers representing extremely large orders are present, possibly indicating bulk purchases or data anomalies. Overall, the platform tends to handle small to moderate quantity orders, with rare outliers potentially offering insights into product success or data integrity issues.ordering system.

In [39]:
# Load the ggplot2 and scales packages
library(ggplot2)
library(scales)

# Filter the data frame to include only non-cancelled orders
non_cancelled_data <- subset(data, !is_Cancelled)

# Create a box plot of the "Quantity" column with logarithmic scale on the y-axis
ggplot(non_cancelled_data, aes(y = Quantity)) +
  geom_boxplot(notch = TRUE) +
  scale_y_continuous(trans = "log10") +
  labs(title = "Box Plot of Quantity")
No description has been provided for this image

Feature engineering¶

In [299]:
length(unique(data$StockCode))
3684
In [300]:
# Load the stringr package
library(stringr)

# Create a new column "len_StockCode" to store the length of trimmed "StockCode" values
data$len_StockCode <- nchar(str_trim(data$StockCode))

# Print the modified data frame
head(data)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryis_Cancelledlen_StockCode
536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850 United Kingdom FALSE 6
536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850 United Kingdom FALSE 5
536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850 United Kingdom FALSE 6
536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE6 12/1/2010 8:26 3.39 17850 United Kingdom FALSE 6
536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850 United Kingdom FALSE 6
536365 22752 SET 7 BABUSHKA NESTING BOXES 2 12/1/2010 8:26 7.65 17850 United Kingdom FALSE 5
In [301]:
format(prop.table(table(data$len_StockCode)), scientific=FALSE)
1
'0.00133713808'
2
'0.00033366202'
3
'0.00003984024'
4
'0.00302785829'
5
'0.91179121722'
6
'0.08270335953'
7
'0.00073704445'
12
'0.00002988018'

The majority of stock codes adhere to a standard format of five numeric characters, while anomalies such as non-numeric and single-digit codes are present but rare. These anomalies, comprising only 0.48% of the dataset, likely represent non-product transactions and their inclusion may introduce noise in subsequent analyses. To ensure focus on genuine product transactions and improve accuracy, it's recommended to filter out rows with anomalous stock codes before further analysis and model development.

In [302]:
num_unique_stockcodes <- length(unique(data[data$len_StockCode == 5, "StockCode"]))

# Print the number of unique stock codes
print(num_unique_stockcodes)
[1] 2798
In [303]:
filtered_data <- data[data$len_StockCode < 5, ]

# Print the filtered data frame
filtered_data
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryis_Cancelledlen_StockCode
46536370 POST POSTAGE 3 12/1/2010 8:45 18.00 12583 France FALSE 4
142C536379 D Discount -1 12/1/2010 9:41 27.50 14527 United Kingdom TRUE 1
387536403 POST POSTAGE 1 12/1/2010 11:2715.00 12791 Netherlands FALSE 4
1098536527 POST POSTAGE 1 12/1/2010 13:0418.00 12662 Germany FALSE 4
1387536540 C2 CARRIAGE 1 12/1/2010 14:0550.00 14911 EIRE FALSE 2
1655536569 M Manual 1 12/1/2010 15:35 1.25 16274 United Kingdom FALSE 1
1666536569 M Manual 1 12/1/2010 15:3518.95 16274 United Kingdom FALSE 1
3790536840 POST POSTAGE 1 12/2/2010 18:2718.00 12738 Germany FALSE 4
3969536852 POST POSTAGE 1 12/3/2010 9:51 18.00 12686 France FALSE 4
4036536858 POST POSTAGE 2 12/3/2010 10:3640.00 13520 Switzerland FALSE 4
4080536861 POST POSTAGE 3 12/3/2010 10:4418.00 12427 Germany FALSE 4
4411536967 POST POSTAGE 1 12/3/2010 12:5718.00 12600 Germany FALSE 4
4485536974 POST POSTAGE 2 12/3/2010 13:5918.00 12682 France FALSE 4
4607536981 M Manual 2 12/3/2010 14:26 0.85 14723 United Kingdom FALSE 1
4636536983 POST POSTAGE 1 12/3/2010 14:3018.00 12712 Germany FALSE 4
4802536990 POST POSTAGE 1 12/3/2010 15:1428.00 12793 Portugal FALSE 4
4865537022 POST POSTAGE 2 12/3/2010 15:4528.00 12725 Italy FALSE 4
4903537026 POST POSTAGE 2 12/3/2010 16:3518.00 12395 Belgium FALSE 4
5550537065 POST POSTAGE 9 12/5/2010 11:5718.00 12567 France FALSE 4
5573537077 M Manual 12 12/5/2010 11:59 0.42 17062 United Kingdom FALSE 1
6116537137 M Manual 36 12/5/2010 12:43 0.85 16327 United Kingdom FALSE 1
6173537140 M Manual 1 12/5/2010 12:53 0.42 12748 United Kingdom FALSE 1
6174537140 M Manual 1 12/5/2010 12:53 0.85 12748 United Kingdom FALSE 1
6597C537164 D Discount -1 12/5/2010 13:2129.29 14527 United Kingdom TRUE 1
6841537197 POST POSTAGE 3 12/5/2010 14:0218.00 12647 Germany FALSE 4
6846537198 POST POSTAGE 1 12/5/2010 14:0318.00 12647 Germany FALSE 4
6866537199 M Manual 1 12/5/2010 14:06 0.65 15894 United Kingdom FALSE 1
6977537201 POST POSTAGE 6 12/5/2010 14:1918.00 12472 Germany FALSE 4
7105537208 M Manual 4 12/5/2010 15:12 0.85 15889 United Kingdom FALSE 1
7257537212 POST POSTAGE 4 12/5/2010 15:2118.00 12720 Germany FALSE 4
.................................
393745580736 POST POSTAGE 2 12/6/2011 8:55 18.00 12716 France FALSE 4
393855580752 POST POSTAGE 3 12/6/2011 9:56 45.00 12478 Greece FALSE 4
393922580753 POST POSTAGE 3 12/6/2011 10:00 18.00 12682 France FALSE 4
393929580756 POST POSTAGE 1 12/6/2011 10:19 18.00 12723 France FALSE 4
394903580884 M Manual 1 12/6/2011 12:21 0.85 15907 United Kingdom FALSE 1
395434580955 POST POSTAGE 1 12/6/2011 14:22 28.00 12442 Spain FALSE 4
395435580956 M Manual 4 12/6/2011 14:23 1.25 17841 United Kingdom FALSE 1
395619C580957 POST POSTAGE -1 12/6/2011 14:23 4.50 12839 United Kingdom TRUE 4
395773580965 POST POSTAGE 3 12/6/2011 14:52 15.00 12417 Belgium FALSE 4
395919580979 POST POSTAGE 3 12/6/2011 15:40 18.00 12362 Belgium FALSE 4
396074580986 POST POSTAGE 4 12/6/2011 16:34 18.00 12650 France FALSE 4
396276581000 POST POSTAGE 5 12/7/2011 8:03 18.00 12720 Germany FALSE 4
396288581001 POST POSTAGE 3 12/7/2011 8:07 18.00 12583 France FALSE 4
396441C581009 M Manual -1 12/7/2011 9:15 125.00 16971 United Kingdom TRUE 1
397436C581145 M Manual -1 12/7/2011 13:48 9.95 17490 United Kingdom TRUE 1
397917581171 POST POSTAGE 2 12/7/2011 15:02 18.00 12615 France FALSE 4
398114581179 POST POSTAGE 1 12/7/2011 15:43240.00 12471 Germany FALSE 4
398159581182 POST POSTAGE 4 12/7/2011 15:56 28.00 12783 Portugal FALSE 4
398170581183 POST POSTAGE 4 12/7/2011 16:24 18.00 12569 Germany FALSE 4
398174581184 POST POSTAGE 2 12/7/2011 16:24 18.00 12569 Germany FALSE 4
398627581232 POST POSTAGE 4 12/8/2011 10:26 40.00 12358 Austria FALSE 4
398846581266 POST POSTAGE 5 12/8/2011 11:25 18.00 12621 Germany FALSE 4
398849581279 POST POSTAGE 3 12/8/2011 11:35 18.00 12437 France FALSE 4
399699581405 M Manual 3 12/8/2011 13:50 0.42 13521 United Kingdom FALSE 1
401178581493 POST POSTAGE 1 12/9/2011 10:10 15.00 12423 Belgium FALSE 4
401196581494 POST POSTAGE 2 12/9/2011 10:13 18.00 12518 Germany FALSE 4
401244C581499 M Manual -1 12/9/2011 10:28224.69 15498 United Kingdom TRUE 1
401426581570 POST POSTAGE 1 12/9/2011 11:59 18.00 12662 Germany FALSE 4
401463581574 POST POSTAGE 2 12/9/2011 12:09 18.00 12526 Germany FALSE 4
401464581578 POST POSTAGE 3 12/9/2011 12:16 18.00 12713 Germany FALSE 4
In [304]:
dim(filtered_data)
  1. 1903
  2. 10
In [305]:
filtered_data <- subset(data, len_StockCode < 5)

# Calculate the frequency of each unique value in the "StockCode" column and normalize
value_counts <- format(prop.table(table(filtered_data$StockCode)), scientific=FALSE)
In [306]:
value_counts_dict <- as.list(value_counts)

# Filter out values whose counts are greater than 0
filtered_counts_dict <- value_counts_dict[value_counts_dict > 0]

# Print the filtered dictionary
print(filtered_counts_dict)
$C2
[1] "0.070415134"

$CRUK
[1] "0.008407777"

$D
[1] "0.040462428"

$DOT
[1] "0.008407777"

$M
[1] "0.241723594"

$PADS
[1] "0.002101944"

$POST
[1] "0.628481345"

In [307]:
filtered_data <- subset(data, len_StockCode > 7)

# Calculate the frequency of each unique value in the "StockCode" column and normalize
value_counts <- table(filtered_data$StockCode)
value_counts_dict <- as.list(value_counts)

# Filter out values whose counts are greater than 0
filtered_counts_dict <- value_counts_dict[value_counts_dict > 0]

# Print the filtered dictionary
print(filtered_counts_dict)
$`BANK CHARGES`
[1] 12

In [308]:
filtered_data <- subset(data, len_StockCode >= 5 & len_StockCode < 8)

# Calculate the frequency of each unique value in the "StockCode" column and normalize
print(length(unique(filtered_data$StockCode)))
[1] 3676
In [309]:
dim(filtered_data)
  1. 399689
  2. 10
In [310]:
dim(data)
  1. 401604
  2. 10
In [311]:
# Total number of records
total_records <- nrow(data)

# Number of records with anomalous stock codes
anomalous_records <- total_records - nrow(filtered_data)

# Calculate the percentage of records with anomalous stock codes
percentage_anomalous <- (anomalous_records / total_records) * 100

# Print the percentage
print(paste("The percentage of records with anomalous stock codes in the dataset is:", round(percentage_anomalous, 2), "%"))
[1] "The percentage of records with anomalous stock codes in the dataset is: 0.48 %"

Insights:

A majority of the unique stock codes (3676 out of 3684) contain exactly 5 numeric characters, which seems to be the standard format for representing product codes in this datase.

There are a few anomalies: 7 stock codes contain no numeric characters and 1 stock code contains only 1 numeric charactns.

Based on the ana the these anomalous codes are just a fraction among all unique stock codes (only 8 out of 3684). These codes seem to represent non-product transactions like "BANK CHARGES", "POST" (possibly postage fees), etc. Since they do not represent actual products and are a very small proportion of the dataset, including them in the analysis might introduce noise and distort the clustering and recommendation We have decided told be to filter out and remove rows with these anomalous stock codes from the dataset before proceeding with further nalysis.

In [312]:
data <- data[data$len_StockCode >= 5 & data$len_StockCode < 8, ]
In [313]:
dim(data)
  1. 399689
  2. 10

Zero unit prices may indicate free items or data entry errors. To understand their nature, it is essential to investigate these zero unit price transactions further.¶

In [60]:
boxplot(data$UnitPrice, notch = TRUE, main = "Box Plot of UnitPrice", ylab = "UnitPrice")
No description has been provided for this image

The box plot reveals a right-skewed distribution of UnitPrice with many outliers at the upper end, suggesting significant variability. Applying a logarithmic scale addresses this by compressing the high-priced outliers and improving visualization, aiding in identifying patterns and trends. This transformation enhances interpretability and facilitates analysis of e-commerce pricing data spanning multiple orders of magnitude. analysis.ization.ploration.

In [61]:
filtered_data <- subset(data, UnitPrice > 0)
# Disable scientific notation for numeric output
options(scipen = 999)
# Create a box plot of the filtered "UnitPrice" column with logarithmic scale on y-axis
boxplot(filtered_data$UnitPrice, notch = TRUE, main = "Box Plot of UnitPrice", ylab = "UnitPrice", log = "y")
No description has been provided for this image

The unit prices exhibit a right-skewed distribution, with most transactions involving lower-priced items and a few high-value outliers. The interquartile range suggests that half of the unit prices fall within a relatively narrow range. Outliers at the higher end, potentially representing luxury items, warrant further investigation for potential data errors.r electronics.

In [314]:
table(data$is_Cancelled)
 FALSE   TRUE 
391183   8506 
In [315]:
dim(data)
  1. 399689
  2. 10
In [316]:
summary(data$UnitPrice)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   1.250   1.950   2.908   3.750 649.500 
In [317]:
zero_price_data <- data[data$UnitPrice == 0, ]

# Summary statistics for the "Quantity" column of the filtered data
summary(zero_price_data$Quantity)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    1.0     2.0    11.0   420.5    36.0 12540.0 

The 33 transactions with zero unit price display significant variability in quantity, ranging from 1 to 12540 items with a notable standard deviation. Given the small number of these transactions, excluding them from the dataset is advisable to ensure the accuracy and reliability of the clustering model and recommendation.stem.

In [318]:
zero_price_data <- data[data$UnitPrice == 0, ]

# Display the filtered data
head(zero_price_data)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryis_Cancelledlen_StockCode
6843537197 22841 ROUND CAKE TIN VINTAGE GREEN 1 12/5/2010 14:02 0 12647 Germany FALSE 5
22620539263 22580 ADVENT CALENDAR GINGHAM SACK 4 12/16/2010 14:36 0 16560 United Kingdom FALSE 5
25552539722 22423 REGENCY CAKESTAND 3 TIER 10 12/21/2010 13:45 0 14911 EIRE FALSE 5
29375540372 22090 PAPER BUNTING RETROSPOT 24 1/6/2011 16:41 0 13081 United Kingdom FALSE 5
29377540372 22553 PLASTERS IN TIN SKULLS 24 1/6/2011 16:41 0 13081 United Kingdom FALSE 5
34904541109 22168 ORGANISER WOOD ANTIQUE WHITE 1 1/13/2011 15:10 0 15107 United Kingdom FALSE 5
In [319]:
# Filter the data to remove records with unit price of zero
data <- data[data$UnitPrice > 0, ]
In [320]:
str(data$InvoiceDate)
 Factor w/ 23260 levels "1/10/2011 10:04",..: 6839 6839 6839 6839 6839 6839 6839 6840 6840 6841 ...
In [321]:
copied_df <- data.frame(data)
In [323]:
library(dplyr)
In [324]:
# Convert InvoiceDate to datetime with specified format
data$InvoiceDate <- as.POSIXct(data$InvoiceDate, format = "%m/%d/%Y %H:%M")
In [325]:
# Set InvoiceDate as index
data <- data %>% 
  arrange(InvoiceDate) %>%  # Ensure data is sorted by InvoiceDate
  mutate(InvoiceDate = as.Date(InvoiceDate))  # Convert to Date type
In [326]:
# Aggregate sales data
data$Sales <- data$Quantity * data$UnitPrice
daily_sales <- data %>% 
  group_by(InvoiceDate) %>% 
  summarize(Sales = sum(Sales))
In [327]:
str(data$InvoiceDate)
 Date[1:399656], format: "2010-12-01" "2010-12-01" "2010-12-01" "2010-12-01" "2010-12-01" ...
In [328]:
head(data)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryis_Cancelledlen_StockCodeSales
536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 2.55 17850 United Kingdom FALSE 6 15.30
536365 71053 WHITE METAL LANTERN 6 2010-12-01 3.39 17850 United Kingdom FALSE 5 20.34
536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 2.75 17850 United Kingdom FALSE 6 22.00
536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE6 2010-12-01 3.39 17850 United Kingdom FALSE 6 20.34
536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01 3.39 17850 United Kingdom FALSE 6 20.34
536365 22752 SET 7 BABUSHKA NESTING BOXES 2 2010-12-01 7.65 17850 United Kingdom FALSE 5 15.30
In [329]:
dim(data)
  1. 399656
  2. 11
In [330]:
daily_sales
InvoiceDateSales
2010-12-0145737.56
2010-12-0242547.42
2010-12-0325283.73
2010-12-0530543.61
2010-12-0630061.75
2010-12-0753126.64
2010-12-0837809.22
2010-12-0935114.58
2010-12-1032759.73
2010-12-1217014.20
2010-12-1327223.84
2010-12-1426789.23
2010-12-1529324.76
2010-12-1643416.13
2010-12-1721228.41
2010-12-18 33.00
2010-12-19 7153.35
2010-12-2017334.56
2010-12-2115415.71
2010-12-22 4776.12
2010-12-23 5200.37
2011-01-0410858.36
2011-01-0528074.22
2011-01-0631704.85
2011-01-0722942.80
2011-01-0915334.53
2011-01-1014763.70
2011-01-1159042.51
2011-01-1216133.55
2011-01-1314782.36
......
2011-11-0642114.91
2011-11-0726475.81
2011-11-0836916.09
2011-11-0957275.99
2011-11-1062231.47
2011-11-1141158.03
2011-11-1327879.22
2011-11-1455826.58
2011-11-1546076.27
2011-11-1646215.65
2011-11-1750402.72
2011-11-1838015.95
2011-11-2029289.83
2011-11-2143374.72
2011-11-2247950.01
2011-11-2369390.52
2011-11-2433330.62
2011-11-2527200.94
2011-11-2716923.39
2011-11-2849077.83
2011-11-2947541.59
2011-11-3040094.66
2011-12-0140158.47
2011-12-0246765.47
2011-12-0419943.60
2011-12-0555332.24
2011-12-0642984.87
2011-12-0768347.61
2011-12-0833495.22
2011-12-0931124.54
In [64]:
# Load required packages
library(ggplot2)
library(tibble)

# Convert daily_sales dataframe to tibble
daily_sales <- as_tibble(daily_sales)

# Create time series plot using ggplot2
ggplot(daily_sales, aes(x = InvoiceDate, y = Sales)) +
  geom_line() +
  labs(title = "Daily Sales Over Time", x = "Date", y = "Sales")
Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
No description has been provided for this image
In [65]:
library(ggplot2)

# Aggregate sales by country
sales_by_country <- aggregate(Sales ~ Country, data = data, FUN = sum)

# Create bar plot
ggplot(sales_by_country, aes(x = Country, y = Sales)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Total Sales by Country", x = "Country", y = "Total Sales") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
No description has been provided for this image
In [66]:
# Create scatter plot
ggplot(data, aes(x = UnitPrice, y = Quantity)) +
  geom_point() +
  labs(title = "Scatter Plot of Quantity vs. Unit Price", x = "Unit Price", y = "Quantity")
No description has been provided for this image

Time Series Analysis¶

In [73]:
library(lubridate)
library(dplyr)
In [331]:
# Load required libraries
# Convert InvoiceDate to datetime with specified format
data$InvoiceDate <- as.POSIXct(data$InvoiceDate, format = "%m/%d/%Y %H:%M")

# Set InvoiceDate as index and convert to Date type
data <- data %>% 
  arrange(InvoiceDate) %>%  # Ensure data is sorted by InvoiceDate
  mutate(InvoiceDate = as.Date(InvoiceDate))

# Calculate sales
data$Sales <- data$Quantity * data$UnitPrice

# Aggregate sales data by month
monthly_sales <- data %>% 
  group_by(YearMonth = format(InvoiceDate, "%Y-%m")) %>%  # Extract year-month from date
  summarize(Sales = sum(Sales))
In [332]:
monthly_sales
YearMonthSales
2010-12 547893.9
2011-01 471430.3
2011-02 434018.2
2011-03 572153.2
2011-04 422112.7
2011-05 650314.2
2011-06 639609.2
2011-07 581234.8
2011-08 612605.1
2011-09 921808.7
2011-10 961042.5
2011-11 1113102.1
2011-12 338152.0
In [333]:
# Plot Monthly Sales
ggplot(monthly_sales, aes(x = YearMonth, y = Sales)) +
  geom_point() +
  labs(title = "Monthly Sales Over Time", x = "Month", y = "Sales")
No description has been provided for this image
In [334]:
library(lubridate)
library(dplyr)
library(plotly)
In [79]:
# Aggregate sales data by month
monthly_sales <- data %>%
  group_by(YearMonth = format(InvoiceDate, "%Y-%m")) %>%
  summarise(Sales = sum(Sales))

# Plot Monthly Sales
plotly::plot_ly(monthly_sales, x = ~YearMonth, y = ~Sales, type = "scatter", mode = "lines") %>%
  plotly::layout(title = "Monthly Sales Over Time", xaxis = list(title = "Month"), yaxis = list(title = "Sales"))
In [80]:
# Load required libraries
library(lubridate)
library(dplyr)
library(plotly)

# Aggregate sales data by week
weekly_sales <- data %>%
  group_by(Week = format(InvoiceDate, "%Y-%U")) %>%
  summarise(Sales = sum(Sales))

# Plot Weekly Sales
plotly::plot_ly(weekly_sales, x = ~Week, y = ~Sales, type = "scatter", mode = "lines") %>%
  plotly::layout(title = "Weekly Sales Over Time", xaxis = list(title = "Week"), yaxis = list(title = "Sales"))
In [335]:
head(data)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryis_Cancelledlen_StockCodeSales
536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 2.55 17850 United Kingdom FALSE 6 15.30
536365 71053 WHITE METAL LANTERN 6 2010-12-01 3.39 17850 United Kingdom FALSE 5 20.34
536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 2.75 17850 United Kingdom FALSE 6 22.00
536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE6 2010-12-01 3.39 17850 United Kingdom FALSE 6 20.34
536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01 3.39 17850 United Kingdom FALSE 6 20.34
536365 22752 SET 7 BABUSHKA NESTING BOXES 2 2010-12-01 7.65 17850 United Kingdom FALSE 5 15.30
In [336]:
# Calculate the number of unique values in the "StockCode" column
unique_stock_codes <- data %>%
  distinct(StockCode) %>%
  nrow()

# Print the number of unique stock codes
print(unique_stock_codes)
[1] 3676
In [337]:
# Calculate the number of unique values in the "CustomerID" column
unique_customer_ids <- data %>%
  distinct(CustomerID) %>%
  nrow()

# Print the number of unique customer IDs
print(unique_customer_ids)
[1] 4362
In [356]:
head(data)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryis_Cancelledlen_StockCodeSales
536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 2.55 17850 United Kingdom FALSE 6 15.30
536365 71053 WHITE METAL LANTERN 6 2010-12-01 3.39 17850 United Kingdom FALSE 5 20.34
536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-12-01 2.75 17850 United Kingdom FALSE 6 22.00
536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE6 2010-12-01 3.39 17850 United Kingdom FALSE 6 20.34
536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-12-01 3.39 17850 United Kingdom FALSE 6 20.34
536365 22752 SET 7 BABUSHKA NESTING BOXES 2 2010-12-01 7.65 17850 United Kingdom FALSE 5 15.30

Pareto Principle Analysis¶

Implementing Pareto's 80-20 rule involves verifying if 20% of customers generate 80% of the revenue and if 20% of products account for 80% of sales volume or revenue. This analysis aims to identify significant contributors to revenue concentration and product popularity in the dataset, aligning with the Pareto principle's concept of the "vital few" driving majority outcomes.

In [340]:
# Step 1: Calculate Total Value per Key
value_per_key <- data %>%
  group_by(CustomerID) %>%
  summarise(Total_Value = sum(Sales))
In [341]:
head(value_per_key)
CustomerIDTotal_Value
12346 0.00
12347 4310.00
12348 1437.24
12349 1457.55
12350 294.40
12352 1265.41
In [342]:
value_per_key_sorted <- value_per_key %>%
  arrange(desc(Total_Value))
In [343]:
head(value_per_key_sorted)
CustomerIDTotal_Value
14646 278778.0
18102 259657.3
17450 189575.5
14911 128768.2
12415 123638.2
14156 113685.8
In [344]:
value_per_key_sorted$Cumulative_Percentage <- cumsum(value_per_key_sorted$Total_Value) / sum(value_per_key_sorted$Total_Value) * 100
In [345]:
head(value_per_key_sorted)
CustomerIDTotal_ValueCumulative_Percentage
14646 278778.0 3.372800
18102 259657.3 6.514268
17450 189575.5 8.807850
14911 128768.2 10.365755
12415 123638.2 11.861593
14156 113685.8 13.237022
In [346]:
# Calculate the percentage of customers contributing to 80% of sales
perc <- round(sum(value_per_key_sorted$Cumulative_Percentage < 80) / nrow(value_per_key_sorted) * 100)

# Print the result
print(paste0(perc, "% of Customers contribute to 80% of sales."))
[1] "27% of Customers contribute to 80% of sales."

27% of Customers contribute to 80% of sales.¶

In [347]:
# Calculate the number of rows where Cumulative_Percentage < 80
num_rows <- nrow(value_per_key_sorted[value_per_key_sorted$Cumulative_Percentage < 80, ])

# Print the result
print(num_rows)
[1] 1163
In [348]:
# Step 1: Calculate Total Value per Key
value_per_key_stock <- data %>%
  group_by(StockCode) %>%
  summarise(Total_Value = sum(Sales))
In [349]:
value_per_key_stock_sorted <- value_per_key_stock %>%
  arrange(desc(Total_Value))
In [350]:
value_per_key_stock_sorted$Cumulative_Percentage <- cumsum(value_per_key_stock_sorted$Total_Value) / sum(value_per_key_stock_sorted$Total_Value) * 100
In [353]:
head(value_per_key_stock_sorted)
StockCodeTotal_ValueCumulative_Percentage
22423 132567.701.603872
85123A 93923.152.740203
85099B 83056.523.745064
47566 67628.434.563267
84879 56331.915.244800
23084 51042.845.862342
In [354]:
# Calculate the percentage of products contributing to 80% of sales
perc <- round(sum(value_per_key_stock_sorted$Cumulative_Percentage < 80) / nrow(value_per_key_stock_sorted) * 100)

# Print the result
print(paste0(perc, "% of Products contribute to 80% of sales."))
[1] "22% of Products contribute to 80% of sales."

22% of Products contribute to 80% of sales.¶

In [355]:
# Calculate the number of rows where Cumulative_Percentage < 80
num_rows <- nrow(value_per_key_stock_sorted[value_per_key_stock_sorted$Cumulative_Percentage < 80, ])

# Print the result
print(num_rows)
[1] 800

Market Basket Analysis¶

Association Rules: Discover commonly co-purchased products by applying market basket analysis.

In [101]:
# Load required libraries
library(arules)
library(dplyr)
library(tidyr)
In [103]:
# Step 1: Group data by InvoiceNo and create a list of items purchased in each transaction
transactions <- data %>%
  group_by(InvoiceNo) %>%
  summarize(Items = list(StockCode))

# Step 2: Remove duplicated items in transactions
transactions$Items <- lapply(transactions$Items, unique)

# Step 3: Convert transactions into a transaction object
transactions <- as(transactions$Items, "transactions")

# Step 4: Mine frequent itemsets
frequent_itemsets <- apriori(transactions, parameter = list(support = 0.01))

# Step 5: Find association rules
rules <- apriori(transactions, parameter = list(support = 0.01, confidence = 0.5))

# Step 6: Sort and display the top 10 association rules by lift
rules_df <- as.data.frame(inspect(rules))
top_10_rules <- rules_df[order(-rules_df$lift), ][1:10, ]

# Display the top 10 association rules
print(top_10_rules)
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.8    0.1    1 none FALSE            TRUE       5    0.01      1
 maxlen target   ext
     10  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 217 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[3676 item(s), 21785 transaction(s)] done [0.25s].
sorting and recoding items ... [532 item(s)] done [0.01s].
creating transaction tree ... done [0.01s].
checking subsets of size 1 2 3 4 done [0.03s].
writing ... [10 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.5    0.1    1 none FALSE            TRUE       5    0.01      1
 maxlen target   ext
     10  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 217 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[3676 item(s), 21785 transaction(s)] done [0.26s].
sorting and recoding items ... [532 item(s)] done [0.00s].
creating transaction tree ... done [0.02s].
checking subsets of size 1 2 3 4 done [0.03s].
writing ... [196 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].
      lhs                    rhs      support    confidence lift      count
[1]   {21086}             => {21094}  0.01083314 0.8280702  55.506181 236  
[2]   {21094}             => {21086}  0.01083314 0.7261538  55.506181 236  
[3]   {84997C}            => {84997D} 0.01051182 0.7387097  37.688034 229  
[4]   {84997D}            => {84997C} 0.01051182 0.5362998  37.688034 229  
[5]   {23254}             => {23256}  0.01005279 0.7711268  50.599387 219  
[6]   {23256}             => {23254}  0.01005279 0.6596386  50.599387 219  
[7]   {23171}             => {23170}  0.01064953 0.8436364  54.054759 232  
[8]   {23170}             => {23171}  0.01064953 0.6823529  54.054759 232  
[9]   {47590B}            => {47590A} 0.01175120 0.7052342  42.914878 256  
[10]  {47590A}            => {47590B} 0.01175120 0.7150838  42.914878 256  
[11]  {22725}             => {22727}  0.01055772 0.6865672  16.490480 230  
[12]  {21136}             => {84879}  0.01175120 0.7231638  11.374819 256  
[13]  {22729}             => {22726}  0.01028230 0.5957447  16.042395 224  
[14]  {22729}             => {22727}  0.01165940 0.6755319  16.225428 254  
[15]  {23295}             => {23293}  0.01133808 0.6785714  31.452508 247  
[16]  {23293}             => {23295}  0.01133808 0.5255319  31.452508 247  
[17]  {22579}             => {22578}  0.01042001 0.8194946  40.946536 227  
[18]  {22578}             => {22579}  0.01042001 0.5206422  40.946536 227  
[19]  {21094}             => {21080}  0.01051182 0.7046154  20.912869 229  
[20]  {82581}             => {82580}  0.01042001 0.7394137  41.515791 227  
[21]  {82580}             => {82581}  0.01042001 0.5850515  41.515791 227  
[22]  {23243}             => {22720}  0.01009869 0.5250597   9.391153 220  
[23]  {22745}             => {22748}  0.01170530 0.7993730  49.897827 255  
[24]  {22748}             => {22745}  0.01170530 0.7306590  49.897827 255  
[25]  {22625}             => {22624}  0.01239385 0.5793991  20.067107 270  
[26]  {23343}             => {23344}  0.01133808 0.6301020  23.148015 247  
[27]  {22749}             => {22750}  0.01092495 0.5935162  34.025660 238  
[28]  {22750}             => {22749}  0.01092495 0.6263158  34.025660 238  
[29]  {22730}             => {22726}  0.01266927 0.5750000  15.483776 276  
[30]  {22730}             => {22727}  0.01459720 0.6625000  15.912417 318  
[31]  {84596F}            => {84596B} 0.01037411 0.7820069  47.060831 226  
[32]  {84596B}            => {84596F} 0.01037411 0.6243094  47.060831 226  
[33]  {22617}             => {22138}  0.01432178 0.7107062  17.574045 312  
[34]  {23200}             => {23199}  0.01450539 0.6638655  19.570110 316  
[35]  {23200}             => {23202}  0.01106266 0.5063025  14.177121 241  
[36]  {23200}             => {23203}  0.01248566 0.5714286  11.347832 272  
[37]  {22328}             => {22326}  0.01225614 0.5644820  20.093531 267  
[38]  {82483}             => {82486}  0.01294469 0.5136612  19.293292 282  
[39]  {21231}             => {21232}  0.01188891 0.6348039  21.641946 259  
[40]  {21930}             => {85099B} 0.01142988 0.5253165   6.965319 249  
[41]  {22385}             => {85099B} 0.01234795 0.5592516   7.415274 269  
[42]  {22866}             => {22867}  0.01106266 0.5073684  23.027127 241  
[43]  {22867}             => {22866}  0.01106266 0.5020833  23.027127 241  
[44]  {22866}             => {22865}  0.01289878 0.5915789  22.105570 281  
[45]  {20712}             => {85099B} 0.01216433 0.5289421   7.013393 265  
[46]  {22114}             => {22112}  0.01198072 0.5038610  15.525618 261  
[47]  {22698}             => {22697}  0.02162038 0.7969543  23.685743 471  
[48]  {22697}             => {22698}  0.02162038 0.6425648  23.685743 471  
[49]  {22698}             => {22699}  0.02070232 0.7631134  19.885675 451  
[50]  {22699}             => {22698}  0.02070232 0.5394737  19.885675 451  
[51]  {22698}             => {22423}  0.01464310 0.5397631   6.244684 319  
[52]  {21928}             => {21929}  0.01303649 0.5144928  18.254438 284  
[53]  {21928}             => {85099B} 0.01491852 0.5887681   7.806642 325  
[54]  {22835}             => {22112}  0.01331191 0.5686275  17.521286 290  
[55]  {22804}             => {85123A} 0.01400046 0.7314149   7.888056 305  
[56]  {22578}             => {22577}  0.01468901 0.7339450  34.384927 320  
[57]  {22577}             => {22578}  0.01468901 0.6881720  34.384927 320  
[58]  {23322}             => {23321}  0.01294469 0.5412668  20.122009 282  
[59]  {22867}             => {22865}  0.01230204 0.5583333  20.863279 268  
[60]  {85099C}            => {85099B} 0.01698416 0.5580694   7.399599 370  
[61]  {20719}             => {20724}  0.01331191 0.5380334  15.524579 290  
[62]  {20971}             => {20972}  0.01083314 0.5119306  20.767985 236  
[63]  {21791}             => {21790}  0.01147579 0.5122951  15.393584 250  
[64]  {21914}             => {21915}  0.01248566 0.5240848  20.065355 272  
[65]  {85099F}            => {85099B} 0.01937113 0.6326837   8.388931 422  
[66]  {22356}             => {20724}  0.01496443 0.6468254  18.663697 326  
[67]  {20723}             => {20724}  0.01377094 0.5952381  17.175181 300  
[68]  {23208}             => {23206}  0.01404636 0.5091514  12.633102 306  
[69]  {23208}             => {23209}  0.01464310 0.5307820  11.369800 319  
[70]  {21929}             => {85099B} 0.01537755 0.5456026   7.234299 335  
[71]  {22355}             => {20724}  0.01381685 0.5041876  14.547983 301  
[72]  {22728}             => {22726}  0.01579068 0.5468998  14.727087 344  
[73]  {22728}             => {22727}  0.01849897 0.6406995  15.388797 403  
[74]  {22697}             => {22699}  0.02556805 0.7598909  19.801701 557  
[75]  {22699}             => {22697}  0.02556805 0.6662679  19.801701 557  
[76]  {22697}             => {22423}  0.01758090 0.5225102   6.045080 383  
[77]  {22662}             => {22382}  0.01634152 0.5884298  12.806136 356  
[78]  {22662}             => {20725}  0.01418407 0.5107438   8.365830 309  
[79]  {82494L}            => {82482}  0.02175809 0.5738499  13.983579 474  
[80]  {82482}             => {82494L} 0.02175809 0.5302013  13.983579 474  
[81]  {22699}             => {22423}  0.01983016 0.5167464   5.978396 432  
[82]  {23202}             => {23203}  0.01973835 0.5526992  10.975891 430  
[83]  {22551}             => {22554}  0.01349552 0.5008518  18.430838 294  
[84]  {22386}             => {85099B} 0.02543034 0.6266968   8.309550 554  
[85]  {21931}             => {85099B} 0.02033509 0.5600506   7.425868 443  
[86]  {22630}             => {22629}  0.01946293 0.6784000  20.756944 424  
[87]  {22629}             => {22630}  0.01946293 0.5955056  20.756944 424  
[88]  {21975}             => {21212}  0.01230204 0.5037594  10.542170 268  
[89]  {22910}             => {22086}  0.02084003 0.6403385  14.090681 454  
[90]  {22726}             => {22727}  0.02460409 0.6625464  15.913531 536  
[91]  {22727}             => {22726}  0.02460409 0.5909592  15.913531 536  
[92]  {21733}             => {85123A} 0.02120725 0.6676301   7.200159 462  
[93]  {21977}             => {21212}  0.01551526 0.5007407  10.478998 338  
[94]  {23300}             => {23301}  0.02139087 0.7258567  20.616412 466  
[95]  {23301}             => {23300}  0.02139087 0.6075619  20.616412 466  
[96]  {84991}             => {21212}  0.01533165 0.5030120  10.526530 334  
[97]  {20726}             => {20725}  0.02015148 0.5232420   8.570546 439  
[98]  {22384}             => {20725}  0.02432867 0.5573081   9.128539 530  
[99]  {22697,22698}       => {22699}  0.01822355 0.8428875  21.964478 397  
[100] {22698,22699}       => {22697}  0.01822355 0.8802661  26.161796 397  
[101] {22697,22699}       => {22698}  0.01822355 0.7127469  26.272742 397  
[102] {22697,22698}       => {22423}  0.01257746 0.5817410   6.730338 274  
[103] {22423,22698}       => {22697}  0.01257746 0.8589342  25.527805 274  
[104] {22423,22697}       => {22698}  0.01257746 0.7154047  26.370713 274  
[105] {22698,22699}       => {22423}  0.01243975 0.6008869   6.951844 271  
[106] {22423,22698}       => {22699}  0.01243975 0.8495298  22.137567 271  
[107] {22423,22699}       => {22698}  0.01243975 0.6273148  23.123610 271  
[108] {22386,85099F}      => {85099B} 0.01069543 0.7925170  10.508206 233  
[109] {85099B,85099F}     => {22386}  0.01069543 0.5521327  13.606573 233  
[110] {22726,22728}       => {22727}  0.01230204 0.7790698  18.712277 268  
[111] {22727,22728}       => {22726}  0.01230204 0.6650124  17.907658 268  
[112] {22726,22727}       => {22728}  0.01230204 0.5000000  17.317170 268  
[113] {22697,22699}       => {22423}  0.01450539 0.5673250   6.563555 316  
[114] {22423,22697}       => {22699}  0.01450539 0.8250653  21.500056 316  
[115] {22423,22699}       => {22697}  0.01450539 0.7314815  21.739869 316  
[116] {23202,23203}       => {85099B} 0.01046592 0.5302326   7.030503 228  
[117] {23202,85099B}      => {23203}  0.01046592 0.6570605  13.048371 228  
[118] {21931,22386}       => {85099B} 0.01023640 0.7852113  10.411337 223  
[119] {21931,85099B}      => {22386}  0.01023640 0.5033860  12.405276 223  
[120] {23206,23209}       => {20725}  0.01037411 0.5931759   9.716042 226  
[121] {20725,23206}       => {23209}  0.01037411 0.5765306  12.349773 226  
[122] {20725,23209}       => {23206}  0.01037411 0.5368171  13.319545 226  
[123] {22383,23206}       => {20725}  0.01009869 0.6077348   9.954513 220  
[124] {20725,23206}       => {22383}  0.01009869 0.5612245  11.501670 220  
[125] {20726,22384}       => {20725}  0.01101675 0.7430341  12.170674 240  
[126] {20725,20726}       => {22384}  0.01101675 0.5466970  12.523444 240  
[127] {20726,20728}       => {20725}  0.01078724 0.6368564  10.431516 235  
[128] {20725,20726}       => {20728}  0.01078724 0.5353075  11.791379 235  
[129] {20725,20728}       => {20726}  0.01078724 0.5042918  13.094157 235  
[130] {20726,20727}       => {20725}  0.01064953 0.6590909  10.795711 232  
[131] {20725,20726}       => {20727}  0.01064953 0.5284738  10.729545 232  
[132] {20726,22382}       => {22383}  0.01005279 0.5354523  10.973498 219  
[133] {20726,22383}       => {22382}  0.01005279 0.5983607  13.022265 219  
[134] {20726,22382}       => {20725}  0.01193482 0.6356968  10.412523 260  
[135] {20725,20726}       => {22382}  0.01193482 0.5922551  12.889389 260  
[136] {20725,22382}       => {20726}  0.01193482 0.5531915  14.363858 260  
[137] {20726,22383}       => {20725}  0.01184301 0.7049180  11.546345 258  
[138] {20725,20726}       => {22383}  0.01184301 0.5876993  12.044242 258  
[139] {20728,22384}       => {20727}  0.01198072 0.6041667  12.266329 261  
[140] {20727,22384}       => {20728}  0.01198072 0.5588865  12.310761 261  
[141] {20727,20728}       => {22384}  0.01198072 0.6141176  14.067879 261  
[142] {20728,22384}       => {22382}  0.01032821 0.5208333  11.335019 225  
[143] {22382,22384}       => {20728}  0.01032821 0.6081081  13.394980 225  
[144] {20728,22382}       => {22384}  0.01032821 0.5696203  13.048556 225  
[145] {20728,22384}       => {22383}  0.01074134 0.5416667  11.100854 234  
[146] {22383,22384}       => {20728}  0.01074134 0.5792079  12.758387 234  
[147] {20728,22383}       => {22384}  0.01074134 0.5120350  11.729425 234  
[148] {20728,22384}       => {20725}  0.01294469 0.6527778  10.692304 282  
[149] {20725,22384}       => {20728}  0.01294469 0.5320755  11.720186 282  
[150] {20725,20728}       => {22384}  0.01294469 0.6051502  13.862458 282  
[151] {22382,22384}       => {20727}  0.01037411 0.6108108  12.401224 226  
[152] {20727,22382}       => {22384}  0.01037411 0.5393795  12.355817 226  
[153] {20727,22384}       => {22383}  0.01120037 0.5224839  10.707726 244  
[154] {22383,22384}       => {20727}  0.01120037 0.6039604  12.262141 244  
[155] {20727,22383}       => {22384}  0.01120037 0.5224839  11.968783 244  
[156] {20727,22384}       => {20725}  0.01418407 0.6616702  10.837959 309  
[157] {20725,22384}       => {20727}  0.01418407 0.5830189  11.836967 309  
[158] {20725,20727}       => {22384}  0.01418407 0.5908222  13.534239 309  
[159] {22382,22384}       => {20725}  0.01161350 0.6837838  11.200173 253  
[160] {20725,22382}       => {22384}  0.01161350 0.5382979  12.331040 253  
[161] {22383,22384}       => {20725}  0.01299059 0.7004950  11.473898 283  
[162] {20725,22384}       => {22383}  0.01299059 0.5339623  10.942961 283  
[163] {20725,22383}       => {22384}  0.01299059 0.5380228  12.324739 283  
[164] {20727,20728}       => {22382}  0.01028230 0.5270588  11.470506 224  
[165] {20728,22382}       => {20727}  0.01028230 0.5670886  11.513537 224  
[166] {20727,22382}       => {20728}  0.01028230 0.5346062  11.775931 224  
[167] {20727,20728}       => {22383}  0.01156759 0.5929412  12.151668 252  
[168] {20728,22383}       => {20727}  0.01156759 0.5514223  11.195466 252  
[169] {20727,22383}       => {20728}  0.01156759 0.5396146  11.886252 252  
[170] {20727,20728}       => {20725}  0.01207253 0.6188235  10.136143 263  
[171] {20725,20728}       => {20727}  0.01207253 0.5643777  11.458497 263  
[172] {20725,20727}       => {20728}  0.01207253 0.5028681  11.076826 263  
[173] {20728,22382}       => {22383}  0.01106266 0.6101266  12.503864 241  
[174] {20728,22383}       => {22382}  0.01106266 0.5273523  11.476893 241  
[175] {22382,22383}       => {20728}  0.01106266 0.5343681  11.770686 241  
[176] {20728,22382}       => {20725}  0.01087905 0.6000000   9.827820 237  
[177] {20725,20728}       => {22382}  0.01087905 0.5085837  11.068427 237  
[178] {20725,22382}       => {20728}  0.01087905 0.5042553  11.107383 237  
[179] {20728,22383}       => {20725}  0.01276107 0.6083151   9.964018 278  
[180] {20725,20728}       => {22383}  0.01276107 0.5965665  12.225966 278  
[181] {20725,22383}       => {20728}  0.01276107 0.5285171  11.641805 278  
[182] {20727,22382}       => {22383}  0.01156759 0.6014320  12.325678 252  
[183] {20727,22383}       => {22382}  0.01156759 0.5396146  11.743759 252  
[184] {22382,22383}       => {20727}  0.01156759 0.5587583  11.344408 252  
[185] {20727,22382}       => {20725}  0.01179711 0.6133652  10.046737 257  
[186] {20725,22382}       => {20727}  0.01179711 0.5468085  11.101793 257  
[187] {20727,22383}       => {20725}  0.01294469 0.6038544   9.890953 282  
[188] {20725,20727}       => {22383}  0.01294469 0.5391969  11.050240 282  
[189] {20725,22383}       => {20727}  0.01294469 0.5361217  10.884819 282  
[190] {22382,22383}       => {20725}  0.01253156 0.6053215   9.914984 273  
[191] {20725,22382}       => {22383}  0.01253156 0.5808511  11.903895 273  
[192] {20725,22383}       => {22382}  0.01253156 0.5190114  11.295368 273  
[193] {22697,22698,22699} => {22423}  0.01106266 0.6070529   7.023180 241  
[194] {22423,22697,22698} => {22699}  0.01106266 0.8795620  22.920166 241  
[195] {22423,22698,22699} => {22697}  0.01106266 0.8892989  26.430254 241  
[196] {22423,22697,22699} => {22698}  0.01106266 0.7626582  28.112537 241  
          lhs         rhs    support confidence     lift count
[1]   {21086} =>  {21094} 0.01083314  0.8280702 55.50618   236
[2]   {21094} =>  {21086} 0.01083314  0.7261538 55.50618   236
[7]   {23171} =>  {23170} 0.01064953  0.8436364 54.05476   232
[8]   {23170} =>  {23171} 0.01064953  0.6823529 54.05476   232
[5]   {23254} =>  {23256} 0.01005279  0.7711268 50.59939   219
[6]   {23256} =>  {23254} 0.01005279  0.6596386 50.59939   219
[23]  {22745} =>  {22748} 0.01170530  0.7993730 49.89783   255
[24]  {22748} =>  {22745} 0.01170530  0.7306590 49.89783   255
[31] {84596F} => {84596B} 0.01037411  0.7820069 47.06083   226
[32] {84596B} => {84596F} 0.01037411  0.6243094 47.06083   226
In [210]:
rules_df
lhsrhssupportconfidenceliftcount
[1]{21086} => {21094} 0.010833140.8280702 55.506181 236
[2]{21094} => {21086} 0.010833140.7261538 55.506181 236
[3]{84997C} => {84997D} 0.010511820.7387097 37.688034 229
[4]{84997D} => {84997C} 0.010511820.5362998 37.688034 229
[5]{23254} => {23256} 0.010052790.7711268 50.599387 219
[6]{23256} => {23254} 0.010052790.6596386 50.599387 219
[7]{23171} => {23170} 0.010649530.8436364 54.054759 232
[8]{23170} => {23171} 0.010649530.6823529 54.054759 232
[9]{47590B} => {47590A} 0.011751200.7052342 42.914878 256
[10]{47590A} => {47590B} 0.011751200.7150838 42.914878 256
[11]{22725} => {22727} 0.010557720.6865672 16.490480 230
[12]{21136} => {84879} 0.011751200.7231638 11.374819 256
[13]{22729} => {22726} 0.010282300.5957447 16.042395 224
[14]{22729} => {22727} 0.011659400.6755319 16.225428 254
[15]{23295} => {23293} 0.011338080.6785714 31.452508 247
[16]{23293} => {23295} 0.011338080.5255319 31.452508 247
[17]{22579} => {22578} 0.010420010.8194946 40.946536 227
[18]{22578} => {22579} 0.010420010.5206422 40.946536 227
[19]{21094} => {21080} 0.010511820.7046154 20.912869 229
[20]{82581} => {82580} 0.010420010.7394137 41.515791 227
[21]{82580} => {82581} 0.010420010.5850515 41.515791 227
[22]{23243} => {22720} 0.010098690.5250597 9.391153 220
[23]{22745} => {22748} 0.011705300.7993730 49.897827 255
[24]{22748} => {22745} 0.011705300.7306590 49.897827 255
[25]{22625} => {22624} 0.012393850.5793991 20.067107 270
[26]{23343} => {23344} 0.011338080.6301020 23.148015 247
[27]{22749} => {22750} 0.010924950.5935162 34.025660 238
[28]{22750} => {22749} 0.010924950.6263158 34.025660 238
[29]{22730} => {22726} 0.012669270.5750000 15.483776 276
[30]{22730} => {22727} 0.014597200.6625000 15.912417 318
........................
[167]{20727,20728} => {22383} 0.01156759 0.5929412 12.151668 252
[168]{20728,22383} => {20727} 0.01156759 0.5514223 11.195466 252
[169]{20727,22383} => {20728} 0.01156759 0.5396146 11.886252 252
[170]{20727,20728} => {20725} 0.01207253 0.6188235 10.136143 263
[171]{20725,20728} => {20727} 0.01207253 0.5643777 11.458497 263
[172]{20725,20727} => {20728} 0.01207253 0.5028681 11.076826 263
[173]{20728,22382} => {22383} 0.01106266 0.6101266 12.503864 241
[174]{20728,22383} => {22382} 0.01106266 0.5273523 11.476893 241
[175]{22382,22383} => {20728} 0.01106266 0.5343681 11.770686 241
[176]{20728,22382} => {20725} 0.01087905 0.6000000 9.827820 237
[177]{20725,20728} => {22382} 0.01087905 0.5085837 11.068427 237
[178]{20725,22382} => {20728} 0.01087905 0.5042553 11.107383 237
[179]{20728,22383} => {20725} 0.01276107 0.6083151 9.964018 278
[180]{20725,20728} => {22383} 0.01276107 0.5965665 12.225966 278
[181]{20725,22383} => {20728} 0.01276107 0.5285171 11.641805 278
[182]{20727,22382} => {22383} 0.01156759 0.6014320 12.325678 252
[183]{20727,22383} => {22382} 0.01156759 0.5396146 11.743759 252
[184]{22382,22383} => {20727} 0.01156759 0.5587583 11.344408 252
[185]{20727,22382} => {20725} 0.01179711 0.6133652 10.046737 257
[186]{20725,22382} => {20727} 0.01179711 0.5468085 11.101793 257
[187]{20727,22383} => {20725} 0.01294469 0.6038544 9.890953 282
[188]{20725,20727} => {22383} 0.01294469 0.5391969 11.050240 282
[189]{20725,22383} => {20727} 0.01294469 0.5361217 10.884819 282
[190]{22382,22383} => {20725} 0.01253156 0.6053215 9.914984 273
[191]{20725,22382} => {22383} 0.01253156 0.5808511 11.903895 273
[192]{20725,22383} => {22382} 0.01253156 0.5190114 11.295368 273
[193]{22697,22698,22699}=> {22423} 0.01106266 0.6070529 7.023180 241
[194]{22423,22697,22698}=> {22699} 0.01106266 0.8795620 22.920166 241
[195]{22423,22698,22699}=> {22697} 0.01106266 0.8892989 26.430254 241
[196]{22423,22697,22699}=> {22698} 0.01106266 0.7626582 28.112537 241
  1. Grouping data by InvoiceNo column to create transaction lists.
  2. Removing duplicated items within transactions to ensure uniqueness.
  3. Converting transaction lists into objects required for the Apriori algorithm.
  4. Mining frequent itemsets with a support threshold of 0.01 and deriving association rules with a confidence threshold of 0.5, then sorting and displaying the top 10 rules by lift.
In [104]:
top_10_rules
lhsrhssupportconfidenceliftcount
[1]{21086} => {21094} 0.010833140.8280702 55.50618 236
[2]{21094} => {21086} 0.010833140.7261538 55.50618 236
[7]{23171} => {23170} 0.010649530.8436364 54.05476 232
[8]{23170} => {23171} 0.010649530.6823529 54.05476 232
[5]{23254} => {23256} 0.010052790.7711268 50.59939 219
[6]{23256} => {23254} 0.010052790.6596386 50.59939 219
[23]{22745} => {22748} 0.011705300.7993730 49.89783 255
[24]{22748} => {22745} 0.011705300.7306590 49.89783 255
[31]{84596F} => {84596B} 0.010374110.7820069 47.06083 226
[32]{84596B} => {84596F} 0.010374110.6243094 47.06083 226
  1. The lhs column indicates the antecedent of association rules, representing items found together in the dataset.
  2. The rhs column denotes the consequent of association rules, signifying items often purchased alongside those on the lhs.
  3. Support measures the frequency of occurrence, confidence quantifies rule strength, and lift assesses the degree of association between lhs and rhs items.side.
In [105]:
filtered_data <- subset(data, StockCode == "22748")
filtered_data$Description[1]
'POPPY\'S PLAYHOUSE KITCHEN'
In [106]:
filtered_data <- subset(data, StockCode == "22745")
filtered_data$Description[1]
'POPPY\'S PLAYHOUSE BEDROOM'

Example:¶

{22745} => {22748} support: 0.01170530 confidence: 0.7993730 lift: 49.89783 count: 255¶

Explanation:¶

lhs: {22745} (left-hand side) indicates the item "22745"¶

¶

rhs: {22748} (right-hand side) indicates the item "2274 " .#### support: 0.01170530 means that the combination of item "22745" and item "22748" occurs in approximately 1.17% of all transacto ns

.¶

confidence: 0.7993730 means that in 79.94% of transactions containing item "22745", item "22748" is also pes e n#### t. lift: 49.89783 indicates that the likelihood of item "22748" being purchased when item "22745" is also purchased is approximately 49.90 times higher than if they were purchased indepnde n t#### ly. count: 255 represents the absolute count of transactions where both items "22745" and "22748" are pr.

RMF Analysis¶

RFM Analysis is a customer segmentation technique that evaluates Recency, Frequency, and Monetary value to identify high-value customers. Recency indicates responsiveness to promotions, Frequency reflects engagement and loyalty, and Monetary value distinguishes between heavy spenders and low-value purchasers. By analyzing these metrics, marketers can tailor communications to specific customer segments, enhancing sales opportunities and customer retention.

Here's a breakdown of each component:

Recency (R) measures how recently a customer made a purchase, with higher scores indicating more responsiveness to promotions. Frequency (F) gauges how often a customer purchases, indicating engagement and loyalty. Monetary (M) reflects the amount a customer spends, distinguishing between heavy spenders and low-value purchasers.

In [107]:
# Calculate Monetary value for each transaction if not already included
data$Monetary <- data$Quantity * data$UnitPrice

# Ensure InvoiceDate is a datetime type
data$InvoiceDate <- as.POSIXct(data$InvoiceDate, format = "%Y-%m-%d %H:%M:%S")

# Calculate Recency in days
current_date <- as.Date(max(data$InvoiceDate)) + 1
data$Recency <- as.numeric(difftime(current_date, data$InvoiceDate, units = "days"))

# Group by CustomerID to calculate RFM values per customer
rfm_df <- data %>%
  group_by(CustomerID) %>%
  summarize(
    Recency = min(Recency),             # Most recent purchase date for each customer
    Frequency = n_distinct(InvoiceNo),  # Number of unique transactions (counts of invoices) for frequency
    Monetary = sum(Monetary)            # Total sum for monetary value
  )

# Inspecting the first few rows of the RFM dataframe
head(rfm_df)
CustomerIDRecencyFrequencyMonetary
12346 326 2 0.00
12347 3 7 4310.00
12348 76 4 1437.24
12349 19 1 1457.55
12350 311 1 294.40
12352 37 8 1265.41

Recency: Lower values signify more recent purchases, with customer 12347 being the most recent (Recency = 3) and customer 12350 the least recent (Recency = 311).

Frequency: Higher values indicate more purchases, with customer 12352 having the highest frequency (Frequency = 8) and customers 12349 and 12350 the lowest (Frequency = 1).

Monetary: Higher values denote higher spending, with customer 12347 having the highest monetary value ($4310.00) and customer 12346 the lowest ($0.00).

Customer Segmentation - K means Clustering¶

  1. Customer segmentation enhances marketing by tailoring experiences and campaigns.
  2. Relevant attributes are vital for effective segmentation strategies.
  3. K-Means clustering is a key tool for grouping customers based on similarities.
In [108]:
unique_customers <- data %>%
  distinct(CustomerID) %>%
  nrow()

print(unique_customers)
[1] 4362
In [109]:
dim(rfm_df)
  1. 4362
  2. 4
In [110]:
head(rfm_df)
CustomerIDRecencyFrequencyMonetary
12346 326 2 0.00
12347 3 7 4310.00
12348 76 4 1437.24
12349 19 1 1457.55
12350 311 1 294.40
12352 37 8 1265.41
In [111]:
# Boxplot for Recency
ggplot(data = rfm_df, aes(x = "", y = Recency)) +
  geom_boxplot() +
  labs(title = "Boxplot of Recency")
No description has been provided for this image
In [112]:
# Boxplot for Frequency
ggplot(data = rfm_df, aes(x = "", y = Frequency)) +
  geom_boxplot() +
  labs(title = "Boxplot of Frequency")
No description has been provided for this image
In [113]:
# Boxplot for Monetary
ggplot(data = rfm_df, aes(x = "", y = Monetary)) +
  geom_boxplot() +
  labs(title = "Boxplot of Monetary")
No description has been provided for this image
In [114]:
# Remove the CustomerID column
new_df <- rfm_df[, c("Recency", "Frequency", "Monetary")]

# Remove outliers
new_df <- new_df[!apply(new_df, 1, function(x) any(abs(scale(x)) > 3)), ]
In [115]:
head(new_df)
RecencyFrequencyMonetary
326 2 0.00
3 7 4310.00
76 4 1437.24
19 1 1457.55
311 1 294.40
37 8 1265.41
In [116]:
# Select the columns
col_names <- c("Recency", "Frequency", "Monetary")
features <- new_df[, col_names]

# Scale the features
scaled_features <- scale(features)

# Convert the scaled features back to a dataframe
scaled_features <- as.data.frame(scaled_features)

# Rename the columns
colnames(scaled_features) <- col_names
In [117]:
head(scaled_features)
RecencyFrequencyMonetary
2.3105470 -0.3296525 -0.22923618
-0.8882559 0.2208200 0.29217240
-0.1653067 -0.1094635 -0.05536396
-0.7298013 -0.4397470 -0.05290693
2.1619958 -0.4397470 -0.19362071
-0.5515398 0.3309144 -0.07615135
In [118]:
# Vector to store SSE values
SSE <- numeric(0)

# Loop through different numbers of clusters
for (cluster in 1:9) {
  # Fit KMeans model
  kmeans_model <- kmeans(scaled_features, centers = cluster, nstart = 10)
  
  # Calculate SSE and store it
  SSE <- c(SSE, kmeans_model$tot.withinss)
}

# Create a dataframe with Cluster and SSE values
frame <- data.frame(Cluster = 1:9, SSE = SSE)

# Plot SSE against number of clusters
plot(frame$Cluster, frame$SSE, type = "b", pch = 19, xlab = "Number of clusters", ylab = "Inertia")
No description has been provided for this image
In [122]:
k <- 4  # Number of clusters
# Fit the KMeans model
kmeans_model <- kmeans(scaled_features, centers = k, nstart = 25)

# Get cluster labels
cluster_labels <- kmeans_model$cluster

# Combine cluster labels with the original dataframe
rfm_df$Cluster <- cluster_labels
In [123]:
print(kmeans_model$centers)
     Recency   Frequency    Monetary
1 -0.7992213  2.43874285  1.18815081
2 -0.8648478 11.25028573 14.82165577
3  1.5570307 -0.35259309 -0.17376008
4 -0.4884870 -0.08048749 -0.07198866
In [124]:
head(rfm_df)
CustomerIDRecencyFrequencyMonetaryCluster
12346 326 2 0.003
12347 3 7 4310.004
12348 76 4 1437.244
12349 19 1 1457.554
12350 311 1 294.403
12352 37 8 1265.414
In [125]:
library(dplyr)
library(ggplot2)
In [127]:
# Group by cluster and calculate mean values for each feature
avg_df <- rfm_df %>%
  group_by(Cluster) %>%
  summarise(
    Recency = mean(Recency),
    Frequency = mean(Frequency),
    Monetary = mean(Monetary)
  )
In [129]:
avg_df
ClusterRecencyFrequencyMonetary
1 11.990291 27.145631 11716.2205
2 5.363636 107.181818 124411.7309
3 249.913488 1.791628 458.5693
4 43.366775 4.263192 1299.8190
In [133]:
ggplot(avg_df, aes(x = as.factor(Cluster), y = Recency, fill = as.factor(Cluster))) +
  geom_bar(stat = "identity", color = "black") +
  labs(x = "Cluster", y = "Recency", title = "Average Recency by Cluster") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
No description has been provided for this image
In [134]:
ggplot(avg_df, aes(x = as.factor(Cluster), y = Frequency, fill = as.factor(Cluster))) +
  geom_bar(stat = "identity", color = "black") +
  labs(x = "Cluster", y = "Frequency", title = "Average Frequency by Cluster") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
No description has been provided for this image
In [135]:
ggplot(avg_df, aes(x = as.factor(Cluster), y = Monetary, fill = as.factor(Cluster))) +
  geom_bar(stat = "identity", color = "black") +
  labs(x = "Cluster", y = "Monetary", title = "Average Monetary by Cluster") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
No description has been provided for this image
In [140]:
ggplot(rfm_df, aes(x = Recency, y = Frequency, z = Monetary, color = as.factor(Cluster))) +
  geom_point() +
  labs(title = "Scatter Plot of RFM Clusters (Individual)",
       x = "Recency", y = "Frequency", z = "Monetary") +
  theme_minimal()
No description has been provided for this image
In [142]:
head(rfm_df)
CustomerIDRecencyFrequencyMonetaryCluster
12346 326 2 0.003
12347 3 7 4310.004
12348 76 4 1437.244
12349 19 1 1457.554
12350 311 1 294.403
12352 37 8 1265.414
In [143]:
# Create scatter plot
ggplot(rfm_df, aes(x = Frequency, y = Monetary, color = as.factor(Cluster))) +
  geom_point() +
  labs(title = "Segmentation K-means",
       x = "Frequency", y = "Monetary") +
  scale_color_manual(values = c("green", "red", "cyan", "magenta")) +
  theme_minimal()
No description has been provided for this image
In [180]:
dim(rfm_df)
  1. 4362
  2. 6
In [185]:
# Calculate the first and third quartiles
Q1 <- quantile(data$Quantity, 0.05)
Q3 <- quantile(data$Quantity, 0.95)

# Calculate the IQR
IQR <- Q3 - Q1

# Define the upper and lower bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Filter the data to remove outliers
filtered_data <- subset(data, Quantity >= lower_bound & Quantity <= upper_bound)
In [186]:
# Calculate the first and third quartiles
Q1 <- quantile(filtered_data$UnitPrice, 0.05)
Q3 <- quantile(filtered_data$UnitPrice, 0.95)

# Calculate the IQR
IQR <- Q3 - Q1

# Define the upper and lower bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Filter the data to remove outliers
filtered_data <- subset(filtered_data, UnitPrice >= lower_bound & UnitPrice <= upper_bound)
In [187]:
head(filtered_data)
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryis_Cancelledlen_StockCodeSalesMonetaryRecency
536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-11-30 18:00:00 2.55 17850 United Kingdom FALSE 6 15.30 15.30 374
536365 71053 WHITE METAL LANTERN 6 2010-11-30 18:00:00 3.39 17850 United Kingdom FALSE 5 20.34 20.34 374
536365 84406B CREAM CUPID HEARTS COAT HANGER 8 2010-11-30 18:00:00 2.75 17850 United Kingdom FALSE 6 22.00 22.00 374
536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE6 2010-11-30 18:00:00 3.39 17850 United Kingdom FALSE 6 20.34 20.34 374
536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 2010-11-30 18:00:00 3.39 17850 United Kingdom FALSE 6 20.34 20.34 374
536365 22752 SET 7 BABUSHKA NESTING BOXES 2 2010-11-30 18:00:00 7.65 17850 United Kingdom FALSE 5 15.30 15.30 374
In [188]:
dim(filtered_data)
  1. 390346
  2. 13
In [189]:
# Distribution Analysis
print("\nDistribution Analysis:")
# Quantity Distribution
ggplot(filtered_data, aes(x = Quantity)) + 
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(title = "Quantity Distribution", x = "Quantity", y = "Frequency") +
  theme_minimal()
[1] "\nDistribution Analysis:"
No description has been provided for this image
In [190]:
ggplot(filtered_data, aes(x = UnitPrice)) + 
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "UnitPrice Distribution", x = "UnitPrice", y = "Frequency") +
  theme_minimal()
No description has been provided for this image
In [191]:
print("\nTime Series Analysis:")
# Resample to monthly frequency
filtered_data$InvoiceDate <- as.Date(filtered_data$InvoiceDate)
monthly_sales <- filtered_data %>%
  group_by(month = floor_date(InvoiceDate, "month")) %>%
  summarize(total_quantity = sum(Quantity))
[1] "\nTime Series Analysis:"
In [192]:
ggplot(monthly_sales, aes(x = month, y = total_quantity)) +
  geom_line() +
  labs(title = "Monthly Sales Volume", x = "Month", y = "Total Quantity Sold") +
  theme_minimal()
No description has been provided for this image
In [193]:
# Customer Analysis
print("\nCustomer Analysis:")
# Top 10 customers by spend
top_customers <- filtered_data %>%
  group_by(CustomerID) %>%
  summarize(total_spend = sum(Quantity * UnitPrice)) %>%
  arrange(desc(total_spend)) %>%
  top_n(10)
[1] "\nCustomer Analysis:"
Selecting by total_spend
In [194]:
ggplot(top_customers, aes(x = reorder(CustomerID, -total_spend), y = total_spend)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Top 10 Customers by Spend", x = "Customer ID", y = "Total Spend") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
No description has been provided for this image
In [195]:
print("\nProduct Analysis:")
# Top 10 products by quantity sold
top_products <- filtered_data %>%
  group_by(Description) %>%
  summarize(total_quantity = sum(Quantity)) %>%
  arrange(desc(total_quantity)) %>%
  top_n(10)

ggplot(top_products, aes(x = reorder(Description, -total_quantity), y = total_quantity)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Top 10 Products by Quantity Sold", x = "Product Description", y = "Quantity Sold") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
[1] "\nProduct Analysis:"
Selecting by total_quantity
No description has been provided for this image
In [196]:
# Geographical Insights
print("\nGeographical Insights:")
# Sales by country
sales_by_country <- filtered_data %>%
  group_by(Country) %>%
  summarize(total_spend = sum(Quantity * UnitPrice)) %>%
  arrange(desc(total_spend))

ggplot(sales_by_country, aes(x = reorder(Country, -total_spend), y = total_spend)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Sales by Country", x = "Country", y = "Total Spend") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
[1] "\nGeographical Insights:"
No description has been provided for this image
In [197]:
# Sales Performance
print("\nSales Performance:")
# Monthly revenue
monthly_revenue <- filtered_data %>%
  group_by(month = floor_date(InvoiceDate, "month")) %>%
  summarize(total_revenue = sum(Quantity * UnitPrice))

ggplot(monthly_revenue, aes(x = month, y = total_revenue)) +
  geom_line() +
  labs(title = "Monthly Revenue", x = "Month", y = "Total Revenue") +
  theme_minimal()
[1] "\nSales Performance:"
No description has been provided for this image
In [198]:
# Order Analysis
print("\nOrder Analysis:")
# Number of items per order
items_per_order <- filtered_data %>%
  group_by(InvoiceNo) %>%
  summarize(total_quantity = sum(Quantity))

ggplot(items_per_order, aes(x = total_quantity)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Items per Order", x = "Items per Order", y = "Frequency") +
  theme_minimal()
[1] "\nOrder Analysis:"
No description has been provided for this image